# --- Core scientific stack ---
import pandas as pd
import numpy as np
# --- scikit-learn: models, model selection, metrics, data ---
from sklearn.neighbors import KNeighborsTransformer, KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
# NOTE(review): RandomForestClassifier is already imported two lines above — duplicate (harmless).
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex
## seaborn
import seaborn as sns
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("white")
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
# NOTE(review): the 'seaborn-whitegrid' style name was removed in matplotlib >= 3.6
# (renamed 'seaborn-v0_8-whitegrid'); this line presumably targets an older matplotlib — confirm.
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
# Graphics in retina format (IPython magic).
%config InlineBackend.figure_format = 'retina'
# Silence all warnings — deliberate, for a clean article rendering.
import warnings
warnings.filterwarnings("ignore")

In this article, we compare a number of classification methods on the breast cancer dataset. The details regarding this dataset can be found in the Diagnostic Wisconsin Breast Cancer Database. We will use the following classification methods and then compare them in terms of performance.
Throughout this website, there are a large number of articles that discuss these methods in detail. Here, we will not discuss the theory behind them and will only apply them. Interested readers are encouraged to see Statistical Learning.
# Load the Diagnostic Wisconsin Breast Cancer dataset into a DataFrame.
data = load_breast_cancer()
df = pd.DataFrame(data['data'], columns=data['feature_names'])
# Title-cased class names, indexed by the numeric target code (0/1).
Temp = [name.title() for name in list(data['target_names'])]
df['Target'] = data['target']
# Human-readable label column: target 1 -> Temp[1], anything else -> Temp[0].
df['Diagnosis'] = df['Target'].map(lambda t: Temp[int(t == 1)])
del Temp
df
As can be seen, the number of instances is 569 and the number of attributes is 32. The objective of the exercise is to create a classification model that can classify the type of Diagnosis based on the rest of the attributes. However, first, let's plot a count plot for the Diagnosis attribute.
# Summarise the class balance: count and percentage per diagnosis class.
Temp = df.groupby(['Diagnosis'])['Diagnosis'].agg({'count'}).reset_index(drop = False).rename(columns ={'count': 'Count'})
Temp['Percentage'] = np.round(100* Temp['Count'].values /Temp['Count'].sum(), 2)
# display(Temp.style.hide_index())
# Horizontal percentage bar chart, annotated with the raw counts.
fig = px.bar(Temp, y= 'Diagnosis', x= 'Percentage', orientation='h', text = 'Count', color_discrete_sequence= ['Bisque'],
             height= 220)
fig.update_traces(marker_line_color= 'DarkRed', marker_line_width=1.5, opacity=1)
# BUGFIX: '%{text:.2}' formatted the integer counts to two significant digits
# (e.g. 357 -> "3.6e+2"); show the exact count instead.
fig.update_traces(texttemplate='%{text}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 100])
fig.update_layout(title = 'Diagnosis Distribution', plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
# Hold out a test set (sklearn's default 25% split; fixed seed for reproducibility).
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# One-row summary table of the resulting array shapes.
shapes = {'X_train': X_train.shape, 'X_test': X_test.shape,
          'y_train': y_train.shape, 'y_test': y_test.shape}
pd.DataFrame(data={'Set': list(shapes), 'Shape': list(shapes.values())}).set_index('Set').T
# ---------- k-Nearest Neighbours ----------
# Candidate neighbour counts: 1..10.
n_neighbors_list = list(np.arange(1,11,1))
# Precompute the k-NN distance graph once for the largest k, so the grid
# search over smaller k can reuse it instead of recomputing all distances.
graph_model = KNeighborsTransformer(n_neighbors=max(n_neighbors_list), mode='distance')
# Classifier consumes the precomputed distance graph.
classifier_model = KNeighborsClassifier(metric='precomputed')
# Making a pipeline: graph construction -> classification.
full_model = Pipeline(steps=[('graph', graph_model), ('classifier', classifier_model)])
# Parameter grid
param_grid = {'classifier__n_neighbors': n_neighbors_list}
# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(full_model, param_grid)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Summary: best CV score / parameters, plus accuracy on the held-out test set.
# FIX: the displayed label 'Best Paramerers' was misspelled.
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Parameters': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())
# Full CV results, ranked by mean test score.
# NOTE(review): Styler.hide_index()/set_precision() were removed in pandas 2.x;
# this presumably targets an older pandas — confirm.
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
.background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Plot the results of the grid search: accuracy (left) and fit time (right).
fig, ax = plt.subplots(1, 2, figsize=(12.5, 6.5))
# Left: mean CV accuracy with std-dev error bars.
_ = ax[0].errorbar(x=n_neighbors_list,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='n_neighbors', title='Classification accuracy')
# Right: mean fit time with std-dev error bars.
_ = ax[1].errorbar(x=n_neighbors_list,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='n_neighbors', title='Fit time (with caching)')
fig.tight_layout()
del graph_model, classifier_model, full_model, param_grid, ax
See sklearn.linear_model.LogisticRegression for more details.
# ---------- Logistic Regression ----------
# Regularization strengths: 1, 10, 100, 1000.
Regularization_Strength = [10.0**x for x in range(4)]
# Inverse of regularization strength (sklearn's C parameter).
C = [1/x for x in Regularization_Strength]
# Parameter grid: stopping tolerance and inverse regularization strength.
param_grid = {'tol': [10.0**x for x in np.arange(-2, -5, -1)], 'C': C,}
# Logistic Regression (high max_iter so the solver converges on unscaled features).
logistic = LogisticRegression(max_iter=10000)
# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(logistic, param_grid, n_jobs=-1)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Summary table. FIX: the displayed label 'Best Paramerers' was misspelled.
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Parameters': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
.background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Compact tick labels: strip '{', '}' and quotes from each parameter dict in one pass.
Temp = [str(p).translate(str.maketrans('', '', "{}'")) for p in grid_model.cv_results_['params']]
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 7))
# Left: mean CV accuracy with std-dev error bars.
_ = ax[0].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Parameters', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
# Right: mean fit time with std-dev error bars.
_ = ax[1].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Parameters', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
fig.tight_layout()
del Temp, grid_model, logistic
# ---------- PCA + Logistic Regression pipeline ----------
# Grid over the number of retained principal components and logistic C.
param_grid = {'pca__n_components': [2, 5, 10, 15, 25, 30], 'logistic__C': np.logspace(-4, 4, 4),}
# Logistic Regression
logistic = LogisticRegression(max_iter=10000, tol=0.1)
# Principal Component Analysis (n_components is set by the grid search).
pca = PCA()
# Making a pipeline: dimensionality reduction -> classification.
full_model = Pipeline(steps=[('pca', pca), ('logistic', logistic)])
# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(full_model, param_grid, n_jobs=-1)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Summary table. FIX: the displayed label 'Best Paramerers' was misspelled.
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Parameters': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
.background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Compact tick labels: strip '{', '}' and quotes from each parameter dict in one pass.
Temp = [str(p).translate(str.maketrans('', '', "{}'")) for p in grid_model.cv_results_['params']]
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 10))
# Left: mean CV accuracy with std-dev error bars.
_ = ax[0].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Parameters', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
# Right: mean fit time with std-dev error bars.
_ = ax[1].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Parameters', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
fig.tight_layout()
# BUGFIX: the original ran 'del ... PCA ...', deleting the imported PCA *class*
# (breaking any later use of it) while leaving the 'pca' instance alive.
# Delete the fitted instance instead.
del Temp, full_model, grid_model, pca, logistic
See sklearn.tree.DecisionTreeClassifier for more details.
# ---------- Decision Tree ----------
# Parameter grid: split criterion and maximum tree depth.
param_grid = {'criterion':['gini','entropy'], 'max_depth': np.arange(2,14)}
# Decision Tree Classifier (FIX: the original comment wrongly said "Logistic Regression").
dtc = DecisionTreeClassifier()
# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(dtc, param_grid, n_jobs=-1)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Summary table. FIX: the displayed label 'Best Paramerers' was misspelled.
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Parameters': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
.background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Compact tick labels: strip '{', '}' and quotes from each parameter dict in one pass.
Temp = [str(p).translate(str.maketrans('', '', "{}'")) for p in grid_model.cv_results_['params']]
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 7))
# Left: mean CV accuracy with std-dev error bars.
_ = ax[0].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Parameters', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
# Right: mean fit time with std-dev error bars.
_ = ax[1].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Parameters', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
fig.tight_layout()
# Also drop the estimator, consistent with the other sections' cleanup.
del Temp, grid_model, dtc
# ---------- Support Vector Machine (RBF kernel) ----------
# Parameter grid: penalty C and kernel coefficient gamma.
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
# Support Vector Machine; 'balanced' weights classes inversely to their frequency.
svm = SVC(kernel='rbf', class_weight='balanced')
# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(svm, param_grid)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Summary table. FIX: the displayed label 'Best Paramerers' was misspelled.
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Parameters': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
.background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Compact tick labels: strip '{', '}' and quotes from each parameter dict in one pass.
Temp = [str(p).translate(str.maketrans('', '', "{}'")) for p in grid_model.cv_results_['params']]
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 7))
# Left: mean CV accuracy with std-dev error bars.
_ = ax[0].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Parameters', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
# Right: mean fit time with std-dev error bars.
_ = ax[1].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Parameters', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
fig.tight_layout()
del Temp, grid_model, svm
A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. See sklearn.ensemble.RandomForestClassifier for more details.
# ---------- Random Forest ----------
# Parameter grid: number of trees, depth, and leaf-size fraction.
# BUGFIX: the original assignment ended with a stray trailing comma, which
# wrapped the dict in a 1-tuple; it only worked because GridSearchCV accepts
# any iterable of dicts. The dict is now assigned directly.
param_grid = {'n_estimators': [n*100 for n in [2**m for m in np.arange(0,2)]],
              'max_depth': list(np.arange(2,4)),
              'min_samples_leaf': [10.0**x for x in np.arange(-1,-4,-1)]}
# Random Forest Classifier
rfc = RandomForestClassifier()
# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(rfc, param_grid)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Summary table. FIX: the displayed label 'Best Paramerers' was misspelled.
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Parameters': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
.background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Compact tick labels: strip '{', '}' and quotes from each parameter dict in one pass.
Temp = [str(p).translate(str.maketrans('', '', "{}'")) for p in grid_model.cv_results_['params']]
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 7))
# Left: mean CV accuracy with std-dev error bars.
_ = ax[0].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Parameters', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
# Right: mean fit time with std-dev error bars.
_ = ax[1].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Parameters', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
fig.tight_layout()
del Temp, grid_model, rfc
Gradient Boosting Classifier builds the model in a stage-wise fashion and it generalizes them by allowing optimization of an arbitrary differentiable loss function [Source]. See sklearn.ensemble.GradientBoostingClassifier for more details.
# ---------- Gradient Boosting ----------
# Parameter grid: loss function, shrinkage, ensemble size, and subsampling.
# NOTE(review): the 'deviance' loss name was renamed 'log_loss' in sklearn >= 1.1
# and 'exponential' later removed; these names presumably target an older sklearn — confirm.
param_grid = {'loss': ['deviance', 'exponential'],
              'learning_rate': [0.1, 0.2, 0.3],
              'n_estimators': [100, 200],
              'subsample': [0.5, 1.0]}
# Gradient Boosting Classifier
gbc = GradientBoostingClassifier()
# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(gbc, param_grid, n_jobs=-1)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Summary table. FIX: the displayed label 'Best Paramerers' was misspelled.
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Parameters': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
.background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Compact tick labels: strip '{', '}' and quotes from each parameter dict in one pass.
Temp = [str(p).translate(str.maketrans('', '', "{}'")) for p in grid_model.cv_results_['params']]
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 7))
# Left: mean CV accuracy with std-dev error bars.
_ = ax[0].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Parameters', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
# Right: mean fit time with std-dev error bars.
_ = ax[1].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Parameters', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
fig.tight_layout()
del Temp, grid_model, gbc
This model optimizes the log-loss function using LBFGS or stochastic gradient descent. See sklearn.neural_network.MLPClassifier.
# ---------- Multi-layer Perceptron ----------
# Parameter grid: optimizer, L2 penalty, and learning-rate schedule
# (learning_rate only affects the 'sgd' solver).
param_grid = {'solver': ['lbfgs', 'sgd', 'adam'],
              'alpha': [10.0**x for x in np.arange(-1,-4,-1)],
              'learning_rate' : ['constant', 'invscaling', 'adaptive']}
# Multi-layer Perceptron classifier (high max_iter so training converges).
mlp = MLPClassifier(max_iter = 1000)
# Searching over specified parameter values for an estimator.
grid_model = GridSearchCV(mlp, param_grid, n_jobs=-1)
# Fitting
_ = grid_model.fit(X_train, y_train)
# Summary table. FIX: the displayed label 'Best Paramerers' was misspelled.
display(pd.DataFrame({'Best Score': [grid_model.best_score_],
                      'Best Parameters': [str(grid_model.best_params_)],
                      'Accuracy': [grid_model.score(X_test,y_test)]}).round(4).style.hide_index())
display(pd.DataFrame(grid_model.cv_results_)[['rank_test_score',
'params','mean_test_score']].sort_values(by=['rank_test_score']).style.hide_index()\
.background_gradient(cmap='YlGn', subset=['mean_test_score']).set_precision(4))
# Compact tick labels: strip '{', '}' and quotes from each parameter dict in one pass.
Temp = [str(p).translate(str.maketrans('', '', "{}'")) for p in grid_model.cv_results_['params']]
# Plot the results of the grid search.
fig, ax = plt.subplots(1, 2, figsize=(12.5, 7))
# Left: mean CV accuracy with std-dev error bars.
_ = ax[0].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_test_score'],
                   yerr=grid_model.cv_results_['std_test_score'], uplims=True, lolims=True)
_ = ax[0].set(xlabel='Parameters', title='Classification accuracy')
_ = ax[0].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
# Right: mean fit time with std-dev error bars.
_ = ax[1].errorbar(x= Temp,
                   y=grid_model.cv_results_['mean_fit_time'],
                   yerr=grid_model.cv_results_['std_fit_time'], color='r', uplims=True, lolims=True)
_ = ax[1].set(xlabel='Parameters', title='Fit time (with caching)')
_ = ax[1].set_xticklabels(labels = Temp, rotation=90, fontsize = 10)
fig.tight_layout()
# Also drop the estimator, consistent with the other sections' cleanup.
del Temp, grid_model, mlp
It seems that the Gradient Boosting Classifier performs slightly better than the rest of the classification methods in this study. All of these classification methods were tuned to perform at their best by implementing GridSearchCV.